{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clustering Events for the Event Recommendation Engine Challenge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#导入工具包\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "from sklearn import metrics\n",
    "from sklearn.preprocessing import normalize\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "import csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1、数据准备"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 在提供的0.EDA代码中完成对数据的探索，events活动样本太大，抽取出只在训练集和测试集中出现的events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the ID of every event referenced by the training and test sets.\n",
    "uniqueEvents = set()\n",
    "\n",
    "for filename in [\"train.csv\", \"test.csv\"]:\n",
    "    # `with` closes the file even if a row fails to parse.\n",
    "    with open(filename, 'r') as f:\n",
    "        next(f, None)  # skip the header row (column names)\n",
    "\n",
    "        for line in f:\n",
    "            line = line.strip()\n",
    "            if not line:  # tolerate blank lines instead of raising IndexError\n",
    "                continue\n",
    "            cols = line.split(\",\")\n",
    "            uniqueEvents.add(cols[1])  # the second column is the event ID"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "13418"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(uniqueEvents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy to events_filt.csv only the rows of events.csv whose event ID is in uniqueEvents.\n",
    "# Both files are opened with newline='' as the csv module expects; without it the\n",
    "# writer emits an extra blank line after every row on Windows.\n",
    "with open('events.csv', 'r', newline='') as f:\n",
    "    # csv.reader (rather than str.split) keeps quoted fields that contain commas intact.\n",
    "    reader = csv.reader(f)\n",
    "    with open(\"events_filt.csv\", \"w\", newline='') as csvfile:\n",
    "        writer = csv.writer(csvfile)\n",
    "        writer.writerow(next(reader))  # copy the header row unchanged\n",
    "\n",
    "        for cols in reader:\n",
    "            # the first column is the event ID; `cols` is empty for blank lines\n",
    "            if cols and cols[0] in uniqueEvents:\n",
    "                writer.writerow(cols)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2、聚类（K值选取与评价）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data= pd.read_csv(\"events_filt.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identifier, time and location columns are not used for clustering; drop them and\n",
    "# keep the remaining columns (101 feature dimensions, per the original note).\n",
    "non_feature_columns = [\n",
    "    \"event_id\", \"user_id\", \"start_time\",\n",
    "    \"city\", \"state\", \"zip\", \"country\", \"lat\", \"lng\",\n",
    "]\n",
    "data_train = data.drop(non_feature_columns, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the feature DataFrame to an ndarray and normalize the samples.\n",
    "# np.asarray replaces DataFrame.as_matrix(), which was removed in pandas 1.0; it also\n",
    "# accepts an ndarray, so re-running this cell no longer fails.\n",
    "data_train = normalize(np.asarray(data_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def K_cluster_analysis(K, data_train, random_state=None):\n",
    "    \"\"\"Cluster `data_train` into K groups and return the Calinski-Harabasz score.\n",
    "\n",
    "    Args:\n",
    "        K: number of clusters passed to MiniBatchKMeans.\n",
    "        data_train: feature samples to cluster.\n",
    "        random_state: optional seed forwarded to MiniBatchKMeans; pass an int for\n",
    "            repeatable scores (the default None keeps the random initialisation).\n",
    "\n",
    "    Returns:\n",
    "        The Calinski-Harabasz score of the fitted clustering.\n",
    "    \"\"\"\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters=K, random_state=random_state)\n",
    "    mb_kmeans.fit(data_train)\n",
    "\n",
    "    # scikit-learn 0.20 renamed calinski_harabaz_score to calinski_harabasz_score and\n",
    "    # 0.23 removed the old spelling; use whichever one the installed version provides.\n",
    "    ch_score_fn = getattr(metrics, \"calinski_harabasz_score\", None)\n",
    "    if ch_score_fn is None:\n",
    "        ch_score_fn = metrics.calinski_harabaz_score\n",
    "\n",
    "    # The Silhouette Coefficient and the Calinski-Harabasz Index are the usual criteria\n",
    "    # for choosing K; for both, a larger value means a better clustering.\n",
    "    CH_score = ch_score_fn(data_train, mb_kmeans.predict(data_train))\n",
    "\n",
    "    print(K,\"Clusterting ,CH_score: {}\".format(CH_score))\n",
    "\n",
    "    return CH_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10 Clusterting ,CH_score: 845.0560668606917\n",
      "20 Clusterting ,CH_score: 442.5750075434759\n",
      "30 Clusterting ,CH_score: 363.2402644501616\n",
      "40 Clusterting ,CH_score: 277.98282474860497\n",
      "50 Clusterting ,CH_score: 287.0088004902241\n",
      "60 Clusterting ,CH_score: 241.29279217181232\n",
      "70 Clusterting ,CH_score: 189.71072351207928\n",
      "80 Clusterting ,CH_score: 198.3782883262406\n",
      "90 Clusterting ,CH_score: 155.623215149852\n",
      "100 Clusterting ,CH_score: 162.36599201283616\n"
     ]
    }
   ],
   "source": [
    "# Candidate numbers of clusters: 10, 20, ..., 100\n",
    "Ks = list(range(10, 101, 10))\n",
    "\n",
    "# CH score of each candidate, in the same order as Ks\n",
    "CH_scores = []\n",
    "for K in Ks:\n",
    "    CH_scores.append(K_cluster_analysis(K, data_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x22e32685e48>]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAHq1JREFUeJzt3XmYVNWZx/Hv2zQ7hGZpCLKkYWRUXFhsDbiiiCxFgsloonFhjIaZxCTGNTiTmSQzxuijUcNkBuOSiOgg0RhlEFDSolEDxEYMoJiBAZVWlA6yGEDC8s4f5/ZDC91UdXd136pbv8/z1FP3nrpV/XY9xa8vp849x9wdERFJrqK4CxARkealoBcRSTgFvYhIwinoRUQSTkEvIpJwCnoRkYRT0IuIJJyCXkQk4RT0IiIJVxx3AQA9evTwsrKyuMsQEckry5Yt+7O7l6Y7LieCvqysjMrKyrjLEBHJK2b2dibHqetGRCThFPQiIgmnoBcRSTgFvYhIwinoRUQSTkEvIpJwCnoRkYTL66BfvBhuuinuKkREclteB/2rr8Ktt8L//m/clYiI5K68DvpUKtw//XS8dYiI5LK8DvqyMhg8WEEvInI4eR30ABMnwu9+Bx99FHclIiK5Ke+DPpWCPXtg4cK4KxERyU15H/SnnAIlJeq+ERGpT94HfXExjB0L8+bB/v1xVyMiknvyPughdN+8/z4sXx53JSIiuScRQT9uHJip+0ZEpC6JCPrSUvjsZxX0IiJ1ySjozewaM3vdzFaZ2Swza2dmA8xsqZmtMbPZZtYmOrZttL82erysOX+BGqkUvPIKbNrUEj9NRCR/pA16M+sDfBsod/fjgFbAhcBtwF3uPgjYAlwRPeUKYIu7HwncFR3X7FIpcIf581vip4mI5I9Mu26KgfZmVgx0ADYCZwOPR4/PAM6LtidF+0SPjzYzy0659Rs6FI44Qt03IiIHSxv07v4ucAfwDiHgtwHLgK3uvjc6rAroE233ATZEz90bHd/94Nc1sylmVmlmldXV1U39PTCDCRPgmWfCBVQiIhJk0nXTlXCWPgA4AugIjK/jUK95ymEeO9Dgfq+7l7t7eWlpaeYVH0YqBdu3w8svZ+XlREQSIZOum3OA9e5e7e57gCeAU4CSqCsHoC/wXrRdBfQDiB7vAnyY1arrK/QcaNNG3TciIrVlEvTvACPMrEPU1z4aeANYBJwfHTMZeCranhPtEz3+nLsfckbfHDp1gjPPVNCLiNSWSR/9UsKXqq8CK6Pn3At8F7jWzNYS+uAfiJ7yANA9ar8WmNoMddcrlYLVq2H9+pb8qSIiucta6GT7sMrLy72ysjIrr7V2LQwaBP/xH/DNb2blJUVEcpKZLXP38nTHJeLK2NqOPBL+9m/VfSMiUiNxQQ+h+2bRItixI+5KRETil9ig370bKirirkREJH6JDPrTT4fOndV9IyICCQ36Nm1gzJiwGEkOfNcsIhKrRAY9hO6bqipYsSLuSkRE4pXYoJ8wIdyr+0ZECl1ig/7Tn4YTT1TQi4gkNughdN8sWQKbN8ddiYhIfBIf9Pv3w4IFcVciIhKfRAd9eTn07KnuGxEpbIkO+qIiGD8+nNHv3Zv+eBGRJEp00EPovtmyJfTVi4gUosQH/bnnQnGxum9EpHAlPui7dIHTTlPQi0jhSnzQQ+i+WbkS3nkn7kpERFpewQQ9hLlvREQKTUEE/dFHw4AB6r4RkcJUEEFvFs7qKypg1664qxERaVlpg97MjjKz12rdtpvZd8ysm5ktNLM10X3X6Hgzs2lmttbMVpjZ8Ob/NdJLpULIP/983JWIiLSstEHv7n9y96HuPhQ4EdgJ/AaYClS4+yCgItoHGA8Mim5TgOnNUXhDjRoFHTqo+0ZECk9Du25GA//n7m8Dk4AZUfsM4LxoexLwkAdLgBIz652VapugXTsYPToEvRYjEZFC0tCgvxCY
FW33cveNANF9z6i9D7Ch1nOqorbYpVLw1luwenXclYiItJyMg97M2gCfBx5Ld2gdbYecQ5vZFDOrNLPK6urqTMtoEi1GIiKFqCFn9OOBV939g2j/g5oumeh+U9ReBfSr9by+wHsHv5i73+vu5e5eXlpa2vDKG6FfPzjhBAW9iBSWhgT9RRzotgGYA0yOticDT9VqvywafTMC2FbTxZMLUil46SXYujXuSkREWkZGQW9mHYAxwBO1mm8FxpjZmuixW6P2ecA6YC1wH/CNrFWbBakU7NsHzz4bdyUiIi2jOJOD3H0n0P2gts2EUTgHH+vAVVmprhmMGAHduoXumy99Ke5qRESaX0FcGVtbq1YwbhzMnx+WGRQRSbqCC3oI3TfV1fDKK3FXIiLS/Aoy6MeNC8sMavSNiBSCggz6bt1g5EgFvYgUhoIMegjdN6++ChtzZuCniEjzKOigBy1GIiLJV7BBf/zx0Levum9EJPkKNuhrFiNZuBB27467GhGR5lOwQQ8h6P/yF3jxxbgrERFpPgUd9KNHh3nq1X0jIklW0EHfoQOcdZaCXkSSraCDHkL3zZo14SYikkQK+miYpc7qRSSpCj7oy8pg8GAFvYgkV8EHPYSz+hdegI8+irsSEZHsU9ATgn7PHvjtb+OuREQk+xT0wCmnQJcu6r4RkWRS0AOtW8PYsWHeGy1GIiJJo6CPpFJhJsvly+OuREQkuxT0kfHjw/w36r4RkaTJKOjNrMTMHjezN81stZmNNLNuZrbQzNZE912jY83MppnZWjNbYWbDm/dXyI7SUjj5ZAW9iCRPpmf0PwUWuPvRwBBgNTAVqHD3QUBFtA8wHhgU3aYA07NacTNKpcI6sps2xV2JiEj2pA16M/sUcAbwAIC7/9XdtwKTgBnRYTOA86LtScBDHiwBSsysd9YrbwapFLjD/PlxVyIikj2ZnNEPBKqBX5rZcjO738w6Ar3cfSNAdN8zOr4PsKHW86uitpw3bBj07q3uGxFJlkyCvhgYDkx392HADg5009TF6mjzQw4ym2JmlWZWWV1dnVGxzc0MJkyAZ54JF1CJiCRBJkFfBVS5+9Jo/3FC8H9Q0yUT3W+qdXy/Ws/vC7x38Iu6+73uXu7u5aWlpY2tP+tSKdi+HV5+Oe5KRESyI23Qu/v7wAYzOypqGg28AcwBJkdtk4Gnou05wGXR6JsRwLaaLp58cM454QIqdd+ISFIUZ3jct4BHzKwNsA64nPBH4ldmdgXwDnBBdOw8YAKwFtgZHZs3OneGM88MQX/77XFXIyLSdBkFvbu/BpTX8dDoOo514Kom1hWrVAquuQbWr4cBA+KuRkSkaXRlbB20GImIJImCvg6DBoWbgl5EkkBBX49UChYtgh074q5ERKRpFPT1SKVg92547rm4KxERaRoFfT3OOAM6dVL3jYjkPwV9Pdq0gTFjQtD7Idf1iojkDwX9YaRSUFUFK1fGXYmISOMp6A9jwoRwr+4bEclnCvrD6N0bhg9X0ItIflPQp5FKweLFsHlz3JWIiDSOgj6NVAr27w9TF4uI5CMFfRonnRTWk1X3jYjkKwV9GkVFMH48LFgA+/bFXY2ISMMp6DOQSsGHH8KSJXFXIiLScAr6DJx7LrRqpe4bEclPCvoMlJTAaacp6EUkPynoM5RKwYoVsGFD3JWIiDSMgj5DNYuRzJsXbx0iIg2loM/QMcdAWZm6b0Qk/yjoM2QWzuorKuDjj+OuRkQkcxkFvZm9ZWYrzew1M6uM2rqZ2UIzWxPdd43azcymmdlaM1thZsOb8xdoSakU7NwJzz8fdyUiIplryBn9We4+1N3Lo/2pQIW7DwIqon2A8cCg6DYFmJ6tYuM2ahS0b6/uGxHJL03pupkEzIi2ZwDn1Wp/yIMlQImZ9W7Cz8kZ7dvD6NFajERE8kumQe/As2a2zMymRG293H0jQHTfM2rvA9QehFgVtX2CmU0xs0ozq6yurm5c9TFIpWD9enjzzbgrERHJ
TKZBf6q7Dyd0y1xlZmcc5liro+2Q8193v9fdy929vLS0NMMy4qfFSEQk32QU9O7+XnS/CfgNcDLwQU2XTHS/KTq8CuhX6+l9gfeyVXDc+veH449X0ItI/kgb9GbW0cw612wD5wKrgDnA5OiwycBT0fYc4LJo9M0IYFtNF09SpFLw0kuwbVvclYiIpJfJGX0v4CUz+yPwB+Bpd18A3AqMMbM1wJhoH2AesA5YC9wHfCPrVccslYK9e+HZZ+OuREQkveJ0B7j7OmBIHe2bgdF1tDtwVVaqy1EjRkDXrqH75oIL4q5GROTwdGVsIxQXw7hxMH9+WGZQRCSXKegbKZWCTZugsjLuSkREDk9B30jjxoVlBjX6RkRynYK+kbp3h5EjFfQikvsU9E2QSsGyZbAxUYNHRSRpFPRNULMYyfz58dYhInI4CvomOP546NsX5s6NuxIRkfop6JugZjGShQth9+64qxERqZuCvolSKfjLX+DFF+OuRESkbgr6Jjr7bGjbVqNvRCR3KeibqGPHEPYPPgiPPx53NSIih1LQZ8G0afA3fxPmvbnwQti8Oe6KREQOUNBnwZFHwuLFcPPN8MQTcOyx8NRT6Z8nItISFPRZ0ro1/PM/h7lveveG886DSy+FDz+MuzIRKXQK+iw74QRYuhS+/3149FE47jh9USsi8VLQN4M2beAHPwiB36MHTJwIX/0qbN0ad2UiUogU9M1o+HB45ZXQpfPQQ+FK2meeibsqESk0Cvpm1rZt+JJ28WLo3DlMbzxlCmzfHndlIlIoFPQt5KST4NVX4cYb4YEHwtl9RUXcVYlIIcg46M2slZktN7O50f4AM1tqZmvMbLaZtYna20b7a6PHy5qn9PzTrh3cdhu89FLYPucc+MY3whQKIiLNpSFn9FcDq2vt3wbc5e6DgC3AFVH7FcAWdz8SuCs6TmoZORJeew2uvRbuuSeM1HnhhbirEpGkyijozawvkALuj/YNOBuoueh/BnBetD0p2id6fHR0vNTSvj385Cch4IuKYNQouPpq2LEj7spEJGkyPaO/G7gR2B/tdwe2uvveaL8K6BNt9wE2AESPb4uOlzqcfjr88Y/wrW+FqRSGDoWXX467KhFJkrRBb2YTgU3uvqx2cx2HegaP1X7dKWZWaWaV1dXVGRWbVB07hpBftAj27g3hf/31sGtX3JWJSBJkckZ/KvB5M3sLeJTQZXM3UGJmxdExfYH3ou0qoB9A9HgX4JCJANz9Xncvd/fy0tLSJv0SSTFqFKxcCf/wD6FbZ9gwWLIk7qpEJN+lDXp3v8nd+7p7GXAh8Jy7XwwsAs6PDpsM1EzjNSfaJ3r8OXc/5Ixe6tapE0yfDs8+Czt3wqmnwtSp8PHHcVcmIvmqKePovwtca2ZrCX3wD0TtDwDdo/ZrgalNK7EwjRkDq1aFqRNuuw1OPDFMmCYi0lCWCyfb5eXlXqkUq9eCBXDllfD+++Hs/l/+JVxxKyKFzcyWuXt5uuN0ZWweGDcunN1fein86EfhKtvly+OuSkTyhYI+T5SUwC9/CXPmQHU1nHwy/PCHsGdP3JWJSK5T0OeZz30OXn8dvvzlMBXyZz8bRuqIiNRHQZ+HunWDhx8Oyxa++274ovaWW8IYfBGRgyno89gXvhDO7r/whTDn/ciR8MYbcVclIrlGQZ/nevSA2bPDbf36cHZ/zz2QA4OpRCRHKOgT4ktfCiNzzjgDvv51+Lu/08LkIhIo6BPk05+G+fPh9tth7lwYMkTTH4uIgj5xiorChGiLF4epkM86K1xgpS9qRQqXgj6hTjwxLF34938f1qw94wx46624qxKROCjoE6xTJ/jFL2DWrDA6Z8gQePTRuKsSkZamoC8AF14Yli489li46CK4/HKtUytSSBT0BWLAAPjd7+B734MZM2D4cFi2LP3zRCT/KegLSHEx/Pu/h5Wsdu0KF1jdcQfs35/+uSKSvxT0BejMM8M6tRMnwg03wPjxYQpkEUkmBX2B
6tYNfv3rcBXtiy/CCSfAvHlxVyUizUFBX8DMwvq0lZXhYqtUCq65BnbvjrsyEckmBb0weDD84Q/wrW/B3XfDiBHw5ptxVyUi2aKgFwDatYNp0+B//geqqsIFV/ffr8nRRJJAQS+fMHFi+KJ25Ej42tfCZGlbtsRdlYg0RdqgN7N2ZvYHM/ujmb1uZj+M2geY2VIzW2Nms82sTdTeNtpfGz1e1ry/gmTbEUfAs8/CbbfBk0+GK2pfeinuqkSksTI5o98NnO3uQ4ChwDgzGwHcBtzl7oOALcAV0fFXAFvc/Ujgrug4yTNFRXDjjfD730ObNmFI5g9+oMnRRPJR2qD3oOaC+dbRzYGzgcej9hnAedH2pGif6PHRZmZZq1ha1EknwfLlcMklYTHyUaPg7bfjrkpEGiKjPnoza2VmrwGbgIXA/wFb3b3m/K4K6BNt9wE2AESPbwO6Z7NoaVmdO4dpEx55BFasCF05jz0Wd1UikqmMgt7d97n7UKAvcDJwTF2HRfd1nb0fMnbDzKaYWaWZVVZXV2dar8ToK18Jk6MddVT4kvbKK2HHjrirEpF0GjTqxt23As8DI4ASMyuOHuoLvBdtVwH9AKLHuwCHLGrn7ve6e7m7l5eWljauemlxAweGL2ZvuilMgXziiaFrR0RyVyajbkrNrCTabg+cA6wGFgHnR4dNBp6KtudE+0SPP+eu0dhJ0ro13HIL/Pa38NFH4QKru+6KZ3I09zDl8ttvh9k4n3kG/vu/wzUBt9wS2vTpk0Jn6TLYzE4gfLnaivCH4Vfu/m9mNhB4FOgGLAcucffdZtYOmAkMI5zJX+ju6w73M8rLy72ysrLJv4y0vM2b4Yor4KmnYNw4ePBB6NWrca9VE9qbN8Of/xxudW0f3JZuyoYhQ0I308UXQ9eujatNJBeZ2TJ3L097XC6cbCvo85s7TJ8O110HXbqEL27PPTec7Tc0tP/617p/hlmYiK1HD+jePdzX3j74vkeP8JzZs+GBB8Kyim3bwhe/GP4wnXVWGEIqks8U9NLiVq0Kq1m9/nro3tmzp+7jiopCINcVzvW1lZRAq1aNr2358hD4jzwCW7eGhVi++tWwpm7fvo1/XZE4KeglFrt2wU9/Ctu21X/GXVIS39n0rl3wm9+E0H/uuVDH2LHhLP9znwsXh4nkCwW9SBrr1oWRQw8+CO++C6WlcNllIfSPqWsAsUiOyTTo1UspBWvgQLj55jBi5+mn4fTTw/9GBg+GU04JZ/1aRF2SQEEvBa9VK5gwIay4VVUFt98eZuy88kro3TvcL16sYZqSvxT0IrX06gXXXw9vvAEvvxyuAH700XCGf+yxcOedoAu5Jd8o6EXqYHag+2bjRrjvvjB09LrroE8fOP98mD8f9u2Lu1KR9BT0Iml07nyg+2bVqrDk4gsvhO6esjL413+F9evjrlKkfgp6kQY49lj4yU/CKJ3HHoPjjgtf6A4cCOecA7Nmwccfx12lyCcp6EUaoU2bA903b70V5upfuzbM8HnEEfDtb4clGUVygYJepIn69w/dN+vWwcKF4QKsn/8chg6F8nK45x4N05R4KehFsqSo6ED3zXvvhTH5e/bA178O/frBDTdodS6Jh4JepBl07x66b157LQzTPPfcMJXzwIFwwQWhTePypaUo6EWaUc0wzdmzw8icG26Aigo47TQ4+eQwyVp9M3aKZIuCXqSF9OsHt94KGzbAf/1XmMb5kkvCEM0f/ShM1SzSHBT0Ii2sY8fQb//GGzBvHpxwAnzve+EPwde+Fsbqi2STgl4kJkVFMH48LFgQ5vCfPDl05Rx/PIwZEyZai2N5RkkeBb1IDhg8OAzD3LABfvxjWL0aJk6Eo4+G//xPDc+UplHQi+SQ7t1h6tTwxe2sWWGN229+M6yCpeGZ0lgKepEc1Lp1WJZx6dIwx864cRqeKY2XNujNrJ+ZLTKz1Wb2upldHbV3M7OFZrYmuu8atZuZTTOztWa2wsyGN/cvIZJk
I0aEqZI1PFMaK5Mz+r3Ade5+DDACuMrMBgNTgQp3HwRURPsA44FB0W0KMD3rVYsUoNrDM6dPD/32tYdnap58qU/aoHf3je7+arT9EbAa6ANMAmZEh80Azou2JwEPebAEKDGz3lmvXKRAdewI//iPYaTOggUwZEgYntm/f24Pz3QPi8brfyAtr7ghB5tZGTAMWAr0cveNEP4YmFnP6LA+wIZaT6uK2jYe9FpTCGf89O/fvxGlixS2oqIwgdrYsWGUzrRpMGMG3H9/mHPnO98JwzeLmuGbuI8/hg8/hM2bD70drn3fvrCAywUXhP+NnH5689Qnn2Se4Tc6ZtYJeAH4kbs/YWZb3b2k1uNb3L2rmT0N/NjdX4raK4Ab3X1Zfa9dXl7ulZWVTfpFRCSE6X33wc9+Fta/HTQIrr46jNHv1OnQ4/ftg61b0wf1wW07d9ZfQ/v2YfRQt27hvvata1dYuTKsz7tjB3zmM3DxxXDppWEoqTSMmS1z9/K0x2US9GbWGpgLPOPud0ZtfwJGRWfzvYHn3f0oM/t5tD3r4OPqe30FvUh27dkDTzwBd98NS5aEs+ixY0O/fu3w3rKl/tE7RUWHhnVd4X1wW/v26evbsQOefBJmzgxTO+/fH6Z0vvTSMNqoZ8/0ryFZDHozM0If/Ifu/p1a7bcDm939VjObCnRz9xvNLAV8E5gAfBaY5u4nH+5nKOhFms+SJWHK5KVLwxn1wUFdX4B/6lMt062ycWMYVTRzJixfDq1ahT9Kl14KkyZl9oejUGUz6E8DXgRWAjUXZP8ToZ/+V0B/4B3gAnf/MPrD8DNgHLATuNzdD5viCnoRgfAF88yZYdhoVVVYr/f880Pon3mm+vMPltWum+amoBeR2vbvDwuwz5wJjz8eZvrs2/dAf/6xx8ZdYW7INOj191FEck5REZx1FvziF/D++2E6iCFD4I47woLsw4bBnXeGxyQ9Bb2I5LQOHcIXtHPnHliisbgYrrsO+vQJ00M88kj4glfqpqAXkbzRs2dYovGVV8K1AzfdBG++Gcbk9+oFl10WRvHs2xd3pblFQS8ieenoo+Hmm2HdutCff9FFMGdOWJ+3f/8wL9CKFXFXmRv0ZayIJMbHH4cunpkzw+pde/eGFbwuuQS+8pXQ1dNc9uyB7dvDNA/btmW+ff31YRhpY2T6ZWyDpkAQEcll7dqF4Zjnnx/W4J09Gx5+GG68Eb77XRg9Ooza+eIXD1wp7B7692uHcGMCe9eu9PW1bRuuT+jSJdxa6loFndGLSOKtWRMC/+GHQ1dPhw5QWnogpNMt2WgWxvTXhHPt+4Zst22b3d9L4+hFRA7iDr//fbgSd/v2QwO5vpDu1Ck3L9ZS142IyEHM4NRTw62Q5ODfKBERySYFvYhIwinoRUQSTkEvIpJwCnoRkYRT0IuIJJyCXkQk4RT0IiIJlxNXxppZNfB23HU0UQ/gz3EXkUP0fhyg9+KT9H58UlPej8+4e2m6g3Ii6JPAzCozuRS5UOj9OEDvxSfp/fiklng/1HUjIpJwCnoRkYRT0GfPvXEXkGP0fhyg9+KT9H58UrO/H+qjFxFJOJ3Ri4gknIK+Ecysn5ktMrPVZva6mV0dtXczs4Vmtia67xp3rS3FzFqZ2XIzmxvtDzCzpdF7MdvM2sRdY0sxsxIze9zM3ow+IyML9bNhZtdE/0ZWmdksM2tXSJ8NM/uFmW0ys1W12ur8LFgwzczWmtkKMxuerToU9I2zF7jO3Y8BRgBXmdlgYCpQ4e6DgIpov1BcDayutX8bcFf0XmwBroilqnj8FFjg7kcDQwjvS8F9NsysD/BtoNzdjwNaARdSWJ+NB4FxB7XV91kYDwyKblOA6Vmrwt11a+INeAoYA/wJ6B219Qb+FHdtLfT7940+sGcDcwEjXABSHD0+Engm7jpb6L34FLCe6PuvWu0F99kA+gAbgG6E1ezmAmML7bMBlAGr0n0WgJ8DF9V1XFNvOqNvIjMrA4YBS4Fe
7r4RILrvGV9lLepu4EagZonl7sBWd98b7VcR/tEXgoFANfDLqCvrfjPrSAF+Ntz9XeAO4B1gI7ANWEbhfjZq1PdZqPnDWCNr742CvgnMrBPwa+A77r497nriYGYTgU3uvqx2cx2HFsrwrmJgODDd3YcBOyiAbpq6RH3Pk4ABwBFAR0L3xMEK5bORTrP9u1HQN5KZtSaE/CPu/kTU/IGZ9Y4e7w1siqu+FnQq8Hkzewt4lNB9czdQYmY1i8/3Bd6Lp7wWVwVUufvSaP9xQvAX4mfjHGC9u1e7+x7gCeAUCvezUaO+z0IV0K/WcVl7bxT0jWBmBjwArHb3O2s9NAeYHG1PJvTdJ5q73+Tufd29jPBF23PufjGwCDg/Oqwg3gsAd38f2GBmR0VNo4E3KMDPBqHLZoSZdYj+zdS8FwX52ailvs/CHOCyaPTNCGBbTRdPU+mCqUYws9OAF4GVHOiX/idCP/2vgP6ED/kF7v5hLEXGwMxGAde7+0QzG0g4w+8GLAcucffdcdbXUsxsKHA/0AZYB1xOOKkquM+Gmf0Q+DJhpNpy4EpCv3NBfDbMbBYwijBD5QfA94EnqeOzEP0x/BlhlM5O4HJ3r8xKHQp6EZFkU9eNiEjCKehFRBJOQS8iknAKehGRhFPQi4gknIJeRCThFPQiIgmnoBcRSbj/B3Fx+RPywW6cAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x22e32424d68>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plot the CH score against the number of clusters; the best K is the one with the\n",
    "# highest score.\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-')\n",
    "plt.xlabel('K (number of clusters)')\n",
    "plt.ylabel('Calinski-Harabasz score')\n",
    "plt.show()  # render the figure instead of echoing the Line2D repr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 从图中可以看出最优值在最左侧，因此将K值空间调整为从2开始"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2 Clusterting ,CH_score: 2176.211182591895\n",
      "3 Clusterting ,CH_score: 1343.4356201051035\n",
      "4 Clusterting ,CH_score: 1538.5630431852817\n",
      "5 Clusterting ,CH_score: 1248.824679870045\n",
      "6 Clusterting ,CH_score: 1145.3274277060573\n",
      "7 Clusterting ,CH_score: 1058.4706655417199\n",
      "8 Clusterting ,CH_score: 887.8832355623327\n",
      "9 Clusterting ,CH_score: 819.3819383083215\n",
      "10 Clusterting ,CH_score: 814.3632666189527\n",
      "20 Clusterting ,CH_score: 412.13331702361427\n",
      "30 Clusterting ,CH_score: 404.5181553580213\n",
      "40 Clusterting ,CH_score: 307.86323076972957\n"
     ]
    }
   ],
   "source": [
    "# Candidate numbers of clusters: every K from 2 to 10, then 20, 30 and 40\n",
    "Ks = list(range(2, 11)) + [20, 30, 40]\n",
    "\n",
    "# CH score of each candidate, in the same order as Ks\n",
    "CH_scores = []\n",
    "for K in Ks:\n",
    "    CH_scores.append(K_cluster_analysis(K, data_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x22e32916908>]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYEAAAD8CAYAAACRkhiPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAH+1JREFUeJzt3XuUXFWZ/vHvQ0IiMUQitAGSQIIGBRwJ0CByEwhKoiwJDiiIXGKcSIQZiJefgDMDchlFwEtQYYBEwEtQuUi4CGYAQWcg0IFAAkGTcA3EpDWIjGA0yfv7Y58eKt3V1Z2u7jrVdZ7PWrWqatepqrdrLXhy9j57b0UEZmZWTJvlXYCZmeXHIWBmVmAOATOzAnMImJkVmEPAzKzAHAJmZgXmEDAzKzCHgJlZgXUZApJGS7pX0hJJT0g6PWu/WNJTkh6XdLOkrbL2MZJel7Qwu11R8ll7SVokaZmkmZLUd3+amZl1RV3NGJa0HbBdRDwiaUtgATAZGAXcExHrJF0EEBFfkjQGuC0i3l3msx4CTgceBO4AZkbELyp9/zbbbBNjxozZ1L/LzKywFixY8IeIaOrOsQO7OiAiVgIrs8evSloCjIyIX5Yc9iBwdKXPycJkWEQ8kD2/jhQmFUNgzJgxtLS0dFWmmZllJD3X3WM3aUwg+1f+HsD8di99io3/Zz5W0qOS7pN0YNY2ElhRcsyKrK3c90yT1CKppbW1dVNKNDOzTdDtEJA0FLgROCMi/lzS/mVgHfCjrGklsENE7AF8DvixpGFAuf7/sn1REXFlRDRHRHNTU7fOaMzMrAe67A4CkLQ5KQB+FBE3lbSfBBwBTIhscCEi1gJrs8cLJC0Hdib9y39UyceOAl7qjT/CzMx6pjtXBwmYBSyJiG+UtE8EvgR8JCJeK2lvkjQge7wTMA54OhtbeFXSvtlnngjc0qt/jZmZbZLunAnsD5wALJK0MGs7G5gJDAbmZVd6PhgRpwAHAedJWgesB06JiDXZ+6YD1wBbkMYQKg4Km5lZ3+rO1UG/oXx//h2dHH8jqeuo3GstQIdLR83MLB+eMWxmVmANGQIRcMEFcNddeVdiZlbfGjIEJLj4Yrj99rwrMTOrbw0ZAgAjRsCqVXlXYWZW3xwCZmYF1rAhsO228Pvf512FmVl9a9gQ8JmAmVnXGjoE/vQnWLs270rMzOpXw4bAttum+9Wr863DzKyeNWwIjBiR7j0uYGbWuYYPAY8LmJl1ziFgZlZgDR8C7g4yM+tcw4bAFlvAsGE+EzAzq6RhQwA8V8DMrCsOATOzAmvoEPDSEWZmlXVnj+HRku6VtETSE5JOz9rfKmmepKXZ/fCsXZJmSlom6XFJe5Z81knZ8UuzTer7lM8EzMwq686ZwDrg8xGxC7AvcKqkXYEzgbsjYhxwd/YcYBJpc/lxwDTgckihAZwDvBfYBzinLTj6ipeOMDOrrMsQiIiVEfFI9vhVYAkwEjgSuDY77Fpgcvb4SOC6SB4EtpK0HXA4MC8i1kTEy8A8YGKv/jXteOkIM7PKNmlMQNIYYA9gPjAiIlZCCgrgbdlhI4EXSt62ImvrrL3PeK6AmVll3Q4BSUOBG4EzIuLPlQ4t0xYV2st91zRJLZJaWltbu1tiB541bGZWWbdCQNLmpAD4UUTclDWvyrp5yO7bOl1WAKNL3j4KeKlCewcRcWVENEdEc1NTU3f/lg4cAmZmlXXn6iABs4AlEfGNkpfmAm1X+JwE3FLSfmJ2ldC+wCtZd9FdwAclDc8GhD+YtfUZdweZmVU2sBvH7A+cACyStDBrOxv4GvBTSVOB54FjstfuAD4ELANeA6YARMQaSecDD2fHnRcRa3rlr+iEl44wM6usyxCIiN9Qvj8fYEKZ4wM4tZPPmg3M3pQCq+W5AmZmnWvoGcPgEDAzq6ThQ8BLR5iZda7hQ8BnAmZmnStECHjpCDOz8ho+BNqW
jvDZgJlZRw0fAp4wZmbWOYeAmVmBOQTMzAqsMCHgy0TNzDpq+BDw0hFmZp1r+BAAzxUwM+uMQ8DMrMAKEQJeOsLMrLxChIDPBMzMyitMCHjpCDOzjgoRAl46wsysvEKEgCeMmZmV5xAwMyuw7mw0P1vSakmLS9p+Imlhdnu2be9hSWMkvV7y2hUl79lL0iJJyyTNzDawrwmHgJlZed3ZaP4a4DvAdW0NEfHxtseSLgVeKTl+eUSML/M5lwPTgAdJm9FPBH6x6SVvOi8dYWZWXpdnAhFxP7Cm3GvZv+Y/Bsyp9BmStgOGRcQD2Ub01wGTN73cnvHSEWZm5VU7JnAgsCoilpa0jZX0qKT7JB2YtY0EVpQcsyJrK0vSNEktklpaW1urLDHxXAEzs46qDYHj2PgsYCWwQ0TsAXwO+LGkYUC5/v/o7EMj4sqIaI6I5qampipLTLbd1iFgZtZed8YEypI0EPgosFdbW0SsBdZmjxdIWg7sTPqX/6iSt48CXurpd/fEiBGwaFEtv9HMrP5VcyZwGPBURPxfN4+kJkkDssc7AeOApyNiJfCqpH2zcYQTgVuq+O5N5u4gM7OOunOJ6BzgAeCdklZImpq9dCwdB4QPAh6X9BhwA3BKRLQNKk8HrgaWAcup0ZVBbbx0hJlZR112B0XEcZ20n1ym7Ubgxk6ObwHevYn19ZrSpSN22CGvKszM6kshZgyDJ4yZmZXjEDAzKzCHgJlZgRUuBLx0hJnZGwoTAl46wsyso8KEAHiugJlZe4UKAW84b2a2sUKFgM8EzMw25hAwMyuwwoWAl44wM3tDoUKgdOkIMzMrWAh4wpiZ2cYcAmZmBeYQMDMrsEKGgOcKmJklhQoBLx1hZraxQoUAdG+uwLp1sNtucPHFtanJzCwv3dlecrak1ZIWl7SdK+lFSQuz24dKXjtL0jJJv5V0eEn7xKxtmaQze/9P6Z7uLB3xi1/Ak0/CT35Sm5rMzPLSnTOBa4CJZdq/GRHjs9sdAJJ2Je09vFv2nu9JGpBtPv9dYBKwK3BcdmzNdedM4Oqr0/0jj8DLL/d9TWZmeekyBCLifmBNV8dljgSuj4i1EfEMaVP5fbLbsoh4OiL+BlyfHVtzXYXAypVw++1wwAEQAb/6Vc1KMzOruWrGBE6T9HjWXTQ8axsJvFByzIqsrbP2mutq6YjrroP16+GKK2DIELj77trWZ2ZWSz0NgcuBtwPjgZXApVm7yhwbFdrLkjRNUoukltbW1h6WWF6lpSMiYNYsOPDANDB80EEOATNrbD0KgYhYFRHrI2IDcBWpuwfSv/BHlxw6CnipQntnn39lRDRHRHNTU1NPSuxUpQljv/41LF0Kn/50ej5hAjz1FLz4Yq+WYGZWN3oUApK2K3l6FNB25dBc4FhJgyWNBcYBDwEPA+MkjZU0iDR4PLfnZfdcpRCYNSvNIzj66PR8woR0f889tanNzKzWBnZ1gKQ5wMHANpJWAOcAB0saT+rSeRb4DEBEPCHpp8CTwDrg1IhYn33OacBdwABgdkQ80et/TTd0FgKvvAI/+xmceGIaCwDYfXfYeuvUJXTCCbWt08ysFroMgYg4rkzzrArHXwhcWKb9DuCOTaquD3S2dMScOfD66zB16httm20GhxySQiACVG5kw8ysHyvcjOHOlo6YNQve8x5obt64/dBDYcUKWLasdjWamdVK4UIAOs4VeOwxaGlJZwHt/7XfNi7gq4TMrBEVMgTaLx0xaxYMGgTHH9/x2HHjYNQoh4CZNaZChkDpmcBf/wo//CF89KNpELg9KZ0N3HsvbNhQ2zrNzPpa4UPg5pvT+kClA8LtTZgAf/xj6jYyM2skhQ2BtqUjZs2CMWPSAHBnPC5gZo2qkCHQtnTE/Pnpf+xTpqTLQTuz/fbwrnc5BMys8RQyBNrmCnz1q6nPf8qUrt8zYQLcfz/87W99W5uZWS0VOgTuvBMOPxxGj658PKQQeO21dPZgZtYoCh0CUHlAuNTB
B6cuI68jZGaNpNAhsM028JGPdO89w4fDnnt6XMDMGkshQ2CLLdIksOnT0ySx7powAR58EP7yl76rzcyslgoZApA2kj/33E17z6GHwt//nvYdMDNrBIUNgYEDK18WWs4BB6QzB3cJmVmjKGwI9MSQIfC+9zkEzKxxOAQ20YQJsHBhWkbCzKy/cwhsogkT0gYz996bdyVmZtVzCGyivfeGoUM9X8DMGkOXISBptqTVkhaXtF0s6SlJj0u6WdJWWfsYSa9LWpjdrih5z16SFklaJmmm1D83a9x8c3j/+z0uYGaNoTtnAtcAE9u1zQPeHRHvAX4HnFXy2vKIGJ/dTilpvxyYBozLbu0/s9+YMAF+97u07aSZWX/WZQhExP3AmnZtv4yIddnTB4FRlT5D0nbAsIh4ICICuA6Y3LOS8+elpc2sUfTGmMCngF+UPB8r6VFJ90k6MGsbCZT+u3lF1laWpGmSWiS1tLa29kKJvevd705bTl56Kbz+et7VmJn1XFUhIOnLwDrgR1nTSmCHiNgD+BzwY0nDgHL9/9HZ50bElRHRHBHNTU1N1ZTYJzbbDK68EhYtgi98Ie9qzMx6rschIOkk4Ajg+KyLh4hYGxF/zB4vAJYDO5P+5V/aZTQKeKmn310PJk2Cz38evve9tEWlmVl/1KMQkDQR+BLwkYh4raS9SdKA7PFOpAHgpyNiJfCqpH2zq4JOBG6puvqc/cd/QHMzfOpT8PzzeVdjZrbpunOJ6BzgAeCdklZImgp8B9gSmNfuUtCDgMclPQbcAJwSEW2DytOBq4FlpDOE0nGEfmnQIJgzB9atg098It2bmfUnynpy6lZzc3O0tLTkXUZFP/4xHH88/Nu/wXnn5V2NmRWdpAUR0dydYz1juBd84hNw8slwwQVeTsLM+heHQC+57DLYeWf45CfhD3/Iuxozs+5xCPSSoUPh+utTAEyZkhaZMzOrdw6BXjR+PFxyCdx2G8ycmXc1ZmZdcwj0stNOS5vXf/GLsGBB3tWYmVXmEOhlEsyeDSNGwOTJ8OKLeVdkZtY5h0Af2HpruPVW+NOf4MMfhj//Oe+KzMzKcwj0kfHj4YYbYPFi+NjH4O9/z7siM7OOHAJ96PDD4T//E+66Cz77WV8xZGb1Z2DeBTS6qVPhmWfgwgth7Fg4++y8KzIze4NDoAbOPx+efRa+/GUYMybNMDYzqwcOgRqQYNasdKXQlCkwcmTap9jMLG8eE6iRwYPhppvg7W9Pl44uWZJ3RWZmDoGaGj4c7rgjBcKkSfD73+ddkZkVnUOgxsaMSctKtLbC6afnXY2ZFZ1DIAfNzXDqqWkewXPP5V2NmRWZQyAn//zPacD4ssvyrsTMiqxbISBptqTVkhaXtL1V0jxJS7P74Vm7JM2UtEzS45L2LHnPSdnxS7ON6gtr9Gg45hi46iovK2Fm+enumcA1wMR2bWcCd0fEOODu7DnAJNIG8+OAacDlkEIDOAd4L7APcE5bcBTVjBkpAGbPzrsSMyuqboVARNwPrGnXfCRwbfb4WmBySft1kTwIbCVpO+BwYF5ErImIl4F5dAyWQtlnHzjgAPj2t2H9+ryrMbMiqmZMYERErATI7t+WtY8EXig5bkXW1ll7oc2YkWYT//zneVdiZkXUFwPDKtMWFdo7foA0TVKLpJbW1tZeLa7eHHlkWlPom9/MuxIzK6JqQmBV1s1Ddr86a18BjC45bhTwUoX2DiLiyohojojmpqamKkqsfwMGpPkC//3fMH9+3tWYWdFUEwJzgbYrfE4CbilpPzG7Smhf4JWsu+gu4IOShmcDwh/M2grvU5+CYcN8NmBmtdfdS0TnAA8A75S0QtJU4GvAByQtBT6QPQe4A3gaWAZcBXwWICLWAOcDD2e387K2wttyS/inf0qTx55/Pu9qzKxIFHW+00lzc3O0tLTkXUafe+65tLjcjBlw8cV5V2Nm/ZmkBRHR3J1jPWO4Tuy4Ixx9dJo89uqreVdjZkXhEKgj
M2bAK6/A97+fdyVmVhQOgTry3vfCfvvBt77lyWNmVhsOgTozY0bak3ju3LwrMbMicAjUmcmT054D3/hG3pWYWRE4BOrMwIHwL/8Cv/kNPPxw3tWYWaNzCNShqVPT5LGpU+GlsnOqzcx6h0OgDg0bBjfemMYG9tsPfve7vCsys0blEKhThx0G994Lr70G++8PDz2Ud0Vm1ogcAnWsuTktLLfllnDooXCXV1oys17mEKhz48bB//wPvOMdcMQR8MMf5l2RmTUSh0A/sO22cN99aReyE06ASy/NuyIzaxQOgX7iLW+BO+9M6wt94QvwxS/Chg15V2Vm/d3AvAuw7hs8GK6/Pm1Cc8klaY2hrbeGrbaC4cM73iZMgD32yLtqM6tnDoF+ZsAAuOwy2GuvtBPZyy+n25o1sHz5G8/bzhI+/nE4//w0tmBm1p73E2hAEfCHP8DMmWn5ibVr4dOfhn//d9h++7yrM7O+5v0ECk6CpqZ0BrB8OUyfDrNnp01rzjwznSmYmUEVISDpnZIWltz+LOkMSedKerGk/UMl7zlL0jJJv5V0eO/8CVbJttum7qOnnkqDyl//OowdC1/9KvzlL3lXZ2Z563EIRMRvI2J8RIwH9gJeA27OXv5m22sRcQeApF2BY4HdgInA9yQNqK58666ddoIf/AAWLoQDD4Szz05zD664Av7+97yrM7O89FZ30ARgeUQ8V+GYI4HrI2JtRDxD2oh+n176fuum97wHbr0Vfv3r1D00fTrssku66siXnJoVT2+FwLHAnJLnp0l6XNJsScOztpHACyXHrMjaLAcHHJCC4LbbYMgQOO64dMXRnXemgWUzK4aqQ0DSIOAjwM+ypsuBtwPjgZVA2/xWlXl72f/dSJomqUVSS2tra7UlWick+PCH4dFH03IUr7wCkybBIYfAAw/kXZ2Z1UJvnAlMAh6JiFUAEbEqItZHxAbgKt7o8lkBjC553yig7Gr5EXFlRDRHRHNTU1MvlGiVDBgAxx+fBo+/8x1YsiQtYT15MjzxRN7VmVlf6o0QOI6SriBJ25W8dhSwOHs8FzhW0mBJY4FxgBdIriODBsGpp6bLSi+4IC1l/Q//ACefDM9VGu0xs36rqhCQNAT4AHBTSfPXJS2S9DhwCDADICKeAH4KPAncCZwaEeur+X7rG0OHwpe/DE8/DZ//fBo03nlnOOMMWL067+rMrDd5xrB16YUX4Lzz0oSzIUNSMHzuc2kHNDOrP54xbL1q9Gi46qo0PnD44fCVr6TLS7/1LfjrX/Ouzsyq4RCwbnvXu+CGG9JWl7vvDjNmwDvfCddcA+vdsWfWLzkEbJPtvTf813/BvHnwtrfBlClpEtrPf+45Bmb9jUPAeuyww9JZwQ03pDOBo46C970PfvWrvCszs+5yCFhVJPjHf4TFi+Hqq2HFijTZbOJEeOSRvKszs644BKxXDBwIU6fC0qVw8cXpDGGvveDYY1ObmdUnh4D1qi22SHsgP/10mmtw661pgbpTToGXys4PN7M8OQSsT2y1VZp1vHx5CoBZs9LS1Wed5U1tzOqJQ8D61LbbpvWIfvtb+OhH4aKL0t4GF10Er72Wd3Vm5hCwmthpp7RS6aOPwv77p20uvamNWf4cAlZTu++e9jC4//4UDNOnw667elMbs7w4BCwXBx6YNrW59VZ405vSpjbNzXDXXZ5wZlZLDgHLjQRHHJH2Pf7BD9KA8cSJcOih8OCDeVdnVgwOAcvdgAHwyU+mwePLLoMnn0wzj486ypvamPU1h4DVjUGD4LTT0mWl558P99yT1iSaMsWb2pj1FYeA1Z2hQ+Ff/zWFwYwZMGdO2tRmxgzwltNmvcshYHVrm23gkkvSshMnnAAzZ6Yrir7yFXj11byrM2sMDgGre6NHp8Xp2ja1OffcFAbf/jasXZt3dWb9W9UhIOnZbE/hhZJasra3SponaWl2Pzxrl6SZkpZJelzSntV+vxVH26Y28+ensYIzzkjdRNde601t
zHqqt84EDomI8SV7Wp4J3B0R44C7s+cAk4Bx2W0acHkvfb8VyD77wN13p01tmprg5JNTKNxyi+cYmG2qvuoOOhK4Nnt8LTC5pP26SB4EtpK0XR/VYA3usMPg4YfhZz+Ddetg8mTYbz+47768KzPrP3ojBAL4paQFkqZlbSMiYiVAdv+2rH0k8ELJe1dkbWY9IsHRR6fxgquughdegIMPhkmT0jpFZlZZb4TA/hGxJ6mr51RJB1U4VmXaOpzAS5omqUVSS6uvCbRuGDgQPv3pNza1mT8f9twzLUexbFne1ZnVr6pDICJeyu5XAzcD+wCr2rp5svvV2eErgNElbx8FdNhqJCKujIjmiGhuamqqtkQrkPab2sydmza1mT7dm9qYlVNVCEh6s6Qt2x4DHwQWA3OBk7LDTgJuyR7PBU7MrhLaF3ilrdvIrDeVbmrzmc+kS0y9qY1ZR9WeCYwAfiPpMeAh4PaIuBP4GvABSUuBD2TPAe4AngaWAVcBn63y+80qatvU5qmnvKmNWTmKOr+mrrm5OVpaWvIuwxrEY4+lbqLbb4ftt4dzzklrE22+ed6VmfUeSQtKLtmvyDOGrVBKN7UZOzZ1Fe26K/zkJ97UxorJIWCF1H5Tm2OP9aY2VkwOASssb2pjBgPzLsAsb22b2hxzTJpwdv75aVObsWPT/IPNNkvHtN2XPq72tXr9rN74nje/Oe0RYfXNIWCWGTw4bWpz8snw3e/C4sVpYboNG9J96eP29+vXp6Ur1q6tfHxXn1Gprb91U0kwcmQK07FjYcyYjR+PGpXCwvLlEDBrZ+hQ+NKX8q6io4gUCH0VMr35GRs2wJo18Mwz8OyzaZe4F1/cOMgGDoQddugYDm2PR4xIZxXWtxwCZv2E9EaXS3+8pHXt2rS20zPPvBEObY9vuw1Wrdr4+De9CXbcsfxZxNixsPXW6Tex6jgEzKwmBg9Os7bf8Y7yr7/2WgqG0nBoezx/fseZ3kOHdh4QY8fCsGF9+uc0DIeAmdWFIUPSnI1ddy3/+iuvlA+IZ55J3U1/+cvGxw8f3vl4xJgx6fvMIWBm/cRb3pIm++2+e8fXIuCPf+wYDs8+mwb4b7ut41akI0Z0fhaxww7FubLJIWBm/Z4E22yTbnvv3fH1DRvSmEO5s4iHHkrblq5bt/HnFeXKJoeAmTW8zTaD7bZLt/326/j6unXp6qVy3U2NfmWTQ8DMCm/gwHQl0o47wvvf3/H1v/0Nnn++fHdTf7+yySFgZtaFQYO6vrLpuec6725as2bj4+vpyiaHgJlZlYYMSTvY7bJL+ddLr2xqfyZx773wv/+78fHDh8Nuu6VFDvuaQ8DMrI9158qm9uFQOlDdlxwCZmY5Kr2yqblb28D0rh6PX0saLeleSUskPSHp9Kz9XEkvSlqY3T5U8p6zJC2T9FtJh/fGH2BmZj1XzZnAOuDzEfFIttn8Aknzste+GRGXlB4saVfgWGA3YHvgvyTtHBHrq6jBzMyq0OMzgYhYGRGPZI9fBZYAIyu85Ujg+ohYGxHPkDab36en329mZtXrlekMksYAewDzs6bTJD0uabak4VnbSOCFkretoHJomJlZH6s6BCQNBW4EzoiIPwOXA28HxgMrgUvbDi3z9rLbZEiaJqlFUktra2u1JZqZWSeqCgFJm5MC4EcRcRNARKyKiPURsQG4ije6fFYAo0vePgp4qdznRsSVEdEcEc1NTU3VlGhmZhVUc3WQgFnAkoj4Rkn7diWHHQUszh7PBY6VNFjSWGAc8FBPv9/MzKpXzdVB+wMnAIskLczazgaOkzSe1NXzLPAZgIh4QtJPgSdJVxad6iuDzMzypajz3asltQLP5V1HJ7YB/pB3ERW4vuq4vuq4vupUU9+OEdGtvvS6D4F6JqklInKY49c9rq86rq86rq86taqvn6x4bWZmfcEhYGZWYA6B6lyZdwFdcH3VcX3VcX3VqUl9HhMwMyswnwmYmRWYQ6AHJD0raVG2VHZL
3vUAZOs0rZa0uKTtrZLmSVqa3Q+v9Bk51NfpsuM1rq2zZdHr4vfrybLtOdT4JkkPSXosq/ErWftYSfOz3/AnkgbVWX3XSHqm5Dccn0d9WS0DJD0q6bbseU1+O4dAzx0SEePr6BKza4CJ7drOBO6OiHHA3dnzvFxDx/ogLTs+PrvdUeOa2rQti74LsC9warb0eb38fp3VB/Xx+wGsBQ6NiN1J64ZNlLQvcFFW4zjgZWBqndUH8MWS33Bh5x/R504nrcbcpia/nUOgQUTE/UC77aw5Erg2e3wtMLmmRZXopL66UGFZ9Lr4/XqwbHvNRdK2U+7m2S2AQ4EbsvY8f8PO6qsLkkYBHwauzp6LGv12DoGeCeCXkhZImpZ3MRWMiIiVkP5HArwt53rKKbfseG7aLYted79fN5dtz0XWnbEQWA3MA5YDf4qItt1yc10+vn19EdH2G16Y/YbflDQ4p/K+Bfw/YEP2fGtq9Ns5BHpm/4jYE5hEOjU/KO+C+qnOlh3PRZll0evKJizbnots9eDxpBWC9wF2KXdYbasq+eJ29Ul6N3AW8C5gb+CtwJdqXZekI4DVEbGgtLnMoX3y2zkEeiAiXsruVwM3U787pK1qW9U1u1+dcz0bqbDseM2VWxadOvr9NnHZ9lxFxJ+AX5HGL7aS1LZQZafLx9dSSX0Ts662iIi1wPfJ5zfcH/iIpGeB60ndQN+iRr+dQ2ATSXqz0p7KSHoz8EHeWC673swFTsoenwTckmMtHVRYdrzWdZRdFp06+f06q69efr+sliZJW2WPtwAOI41d3AscnR2W529Yrr6nSkJepD73mv+GEXFWRIyKiDGkfdjviYjjqdFv58lim0jSTqR//UNaivvHEXFhjiUBIGkOcDBp5cFVwDnAz4GfAjsAzwPHREQug7Od1HcwqSvj/5Ydb+uDr3FtBwC/BhbxRp/s2aR+99x/vwr1HUcd/H5Zje8hDV4OIP3j8qcRcV7238v1pK6WR4FPZv/qrpf67gGaSN0vC4FTSgaQa07SwcAXIuKIWv12DgEzswJzd5CZWYE5BMzMCswhYGZWYA4BM7MCcwiYmRWYQ8DMrMAcAmZmBeYQMDMrsP8PkTC60NVRXf4AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x22e328d4470>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plot the CH score against the number of clusters; the best K is the one with the\n",
    "# highest score.\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-')\n",
    "plt.xlabel('K (number of clusters)')\n",
    "plt.ylabel('Calinski-Harabasz score')\n",
    "plt.show()  # render the figure instead of echoing the Line2D repr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3、结果分析\n",
    "该数据集的词频特征中有部分取值很大（远大于该特征的标准差）。在未对数据进行正规化（normalize）之前，聚类结果非常不稳定，最佳K值经常在2-10之间随机出现。推测原因是：未正规化时这些数值较大的特征对聚类模型而言相当于离群点，对聚类中心的收敛位置影响较大，导致CH指数（Calinski-Harabasz Index）的结果波动很大。\n",
    "   \n",
    "由于数据本身是稀疏的，因此可以对数据进行正规化。处理后最优K值稳定在K=2处，但正规化后的CH指数似乎比之前要低。可能的原因是：正规化改变了原本类间与类内距离的比值，而CH指数是基于类间、类内距离平方和之比计算的，因此在聚类结构较为明显（即类间距离远大于类内距离）的情况下，正规化可能会降低CH指数。"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
